*Generates TABLES 3 and 4: Regressions of provider entropy/violations on health outcomes
*Version 15 Stata

*turn off --more-- paging so the do-file runs through without waiting for a keypress
set more off

*******************************************************************************
*NOTE: Overview of do-file

*Step 1: clean up IMS entropy by NPI 	
*Step 2: prepare final member-month data: merging in lagged entropy, geo info, and CPI-adjusted costs
*Step 3: generate list of patients with valid entropy and violation information
*Step 4: entropy regressions
*Step 5: violation regressions

* --> Input dataset: "adults_sample_series_merged_All.dta"; final dataset: "adults_sample_series_merged_treatment_period.dta" (main dataset for regressions!)

*******************************************************************************


*Step 1: Build the NPI-by-year entropy lookup from the IMS entropy file
********************************************************************************
*load the provider-level entropy master file
use Princeton_entropy_master_file.dta, clear

*attach NPIs through the IQVIA provider ID-to-NPI crosswalk;
*only providers found in both files are retained, and no _merge variable is left behind
merge m:1 provider_id using provider_AMA_NPI_matched_KEY.dta, keep(match) nogenerate

*retain just the lookup keys (npi_str, year) and the entropy score
keep npi_str year entropy_molecule_avg

*write out the lookup that Step 2 merges onto the member-month data
save Princeton_entropy_any_NPI.dta, replace

*Step 2: Prepare final member-month data
********************************************************************************
*IMPORTANT: this step will take quite some time and may run into out of memory errors;
*	it yields the final dataset "adults_sample_series_merged_treatment_period.dta"

*bring in data (created in do-file gen_monthly_data.do)
use adults_sample_series_merged_All.dta, clear
sort memb_bid_str month_code
*NOTE: an earlier version saved a full copy of the data to a tempfile at this point;
*	that copy was never read back, so it has been dropped to save time and disk space

* member gender / birth year: replace the monthly values with the within-member mean
*	so that each member carries one value on every row
bysort memb_bid_str: egen memb_female=mean(female)
rename brth_yr brth_yr0
bysort memb_bid_str: egen brth_yr=mean(brth_yr0) 

* serv_yr / serv_month: rebuild both from month_code
*	month_code 1-12 -> 2013, 13-24 -> 2014, 25-36 -> 2015, 37-45 -> 2016;
*	any month_code outside 1-45 (or missing) yields serv_yr==0
drop serv_yr serv_month
gen serv_yr=2013*(inrange(month_code,1,12))+2014*(inrange(month_code,13,24))+2015*(inrange(month_code,25,36))+2016*(inrange(month_code,37,45))
*calendar month 1-12: mod() returns 0 for multiples of 12, which are recoded to 12
gen serv_month=mod(month_code,12)
replace serv_month=12 if serv_month==0 
tab serv_month

* attach supplementary facility data (number of hospitalizations), keeping the
* merge result under its own name
merge 1:1 memb_bid_str month_code using monthly_FACILITY_adults_sample_ad_supp.dta, generate(merge_facil_supp)

* member-months without a hospitalization count are set to zero
foreach cnt of varlist n_hosp n_hosp_mh {
	replace `cnt'=0 if `cnt'==.
}

* attach second supplementary facility file (ER visits + secondary MH diagnoses)
merge 1:1 memb_bid_str month_code using monthly_FACILITY_adults_sample_ad_supp2.dta, generate(merge_facil_supp2)

* zero-fill the secondary-diagnosis counts for records without a value
foreach cnt of varlist n_admit_ER_mh2 n_admit_urgent_mh2 n_admit_elective_mh2 n_hosp_mh2 {
	replace `cnt'=0 if `cnt'==.
}

* 0/1 indicators: any ER visit / any hospitalization (overall and mental health)
foreach stem in ER ER_mh hosp hosp_mh {
	gen i_`stem'=(n_`stem'>0 & n_`stem'!=.)
}

* either an ER visit or a hospitalization
gen i_ER_hosp=(i_hosp==1 | i_ER==1)
gen i_ER_hosp_mh=(i_hosp_mh==1 | i_ER_mh==1)

* same indicators built from the secondary-diagnosis counts
gen i_ER_mh2=(n_admit_ER_mh2>0 & n_admit_ER_mh2!=.)
gen i_hosp_mh2=(n_hosp_mh2>0 & n_hosp_mh2!=.)
gen i_ER_hosp_mh2=(i_hosp_mh2==1 | i_ER_mh2==1)

* suffix the primary-diagnosis versions with "1" and label both versions
foreach stem in ER_mh hosp_mh ER_hosp_mh {
	rename i_`stem' i_`stem'1
	label var i_`stem'1 "by primary ICD9"
	label var i_`stem'2 "by secondary ICD9"
}

* combined outcome: flagged under either the primary or the secondary version
gen i_ER_hosp_mh=(i_ER_hosp_mh2==1 | i_ER_hosp_mh1==1)
label var i_ER_hosp_mh "ER/hosp for mental health - primary and second ICD9"

*bring in 1-month lagged entropy scores (from entropy data created in Step 1 above)
rename npi_str npi_str0 
rename npi npi0 

*lag the prescriber by one row within member.
*The within-member order is fixed explicitly with (month_code): the preceding merges
*may leave rows in arbitrary order within member, and [_n-1] is only the previous
*month if the rows are ordered by month_code.
*NOTE(review): [_n-1] is the previous observed row; if a member's months have gaps
*	it is not necessarily the previous calendar month -- confirm the panel is gap-free
bysort memb_bid_str (month_code): gen npi_str=npi_str0[_n-1] 
bysort memb_bid_str (month_code): gen npi=npi0[_n-1] 
bysort memb_bid_str (month_code): gen i_lag_phy=i_phy[_n-1]

*entropy is looked up for the lagged prescriber in the year before the service year
gen year=serv_yr-1
merge m:1 npi_str year using Princeton_entropy_any_NPI.dta
*drop lookup rows that match no member-month
keep if _merge!=2 
rename _merge merge_entropy1 

* cluster at NPI level -- use a numeric assigned prescriber-ID "i_npi_str"
*number the rows within each NPI; rows with an empty NPI get no number
bysort npi_str: gen uniq_npi=_n
replace uniq_npi=. if npi_str=="" 
*diagnostics: number of distinct NPIs, and how many of those matched an entropy score
count if uniq_npi==1
count if uniq_npi==1 & merge_entropy1==3

*keep a 1 on the first row of each NPI only; the running sum over the NPI-sorted
*data then increases by one at the start of each NPI, giving a consecutive numeric ID
replace uniq_npi=. if uniq_npi!=1
sort npi_str
gen i_npi_str=sum(uniq_npi)
sum i_npi_str if npi_str=="" 
*rows without a prescriber get a missing cluster ID
replace i_npi_str=. if npi_str==""
sum i_npi_str

*per-member total of i_psych across all of the member's rows
*(used to build the patient subsets in the regression steps)
bysort memb_bid_str: egen sum_i_psych=total(i_psych)

* attach the member-level zip/county file (created in do-file gen_monthly_data.do),
* storing the merge result as merge_county
merge m:1 memb_bid_str using uniq_member_adults_sample_zip.dta, generate(merge_county)
* rows that did not match in both files get county code 0
replace memb_county=0 if merge_county!=3

*compute real costs in Jan-2013 dollars using CPI
*the CPI file is keyed on (year, month_code) with year meaning the service year,
*so serv_yr is temporarily renamed to year and the entropy lookup year is parked
rename year prior_year
rename serv_yr year

*file name is quoted in full (the original line had a stray trailing quote)
merge m:1 year month_code using "CPI_2013-2016.dta"
*keep only member-months that have a CPI value
drop if _merge!=3
drop _merge

*recode variables: restore the original variable names
rename year serv_yr 
rename prior_year year

*calculate total real cost (deflate by cumulative CPI change) and its log
gen real_total_cost=total_cost/(1+monthly_CPI)
gen ln_total_cost=log(1+real_total_cost)

*calculate cost over 3 month period: current month plus the next two rows of the member
*(missing for a member's last two rows, where a lead does not exist)
sort memb_bid_str month_code
bysort memb_bid_str: gen cost1=real_total_cost[_n+1]
bysort memb_bid_str: gen cost2=real_total_cost[_n+2] 
gen three_month_cost=real_total_cost+cost1+cost2
gen ln_three_month_cost=log(1+three_month_cost) 

*calculation non-pharmacy cost: facility + professional cost, deflated the same way
gen non_pharm_cost=(facility_cost+prof_cost)/(1+monthly_CPI)
sort memb_bid_str month_code 
*reuse the helper names cost1/cost2 for the non-pharmacy leads
drop cost1 cost2 
bysort memb_bid_str: gen cost1=non_pharm_cost[_n+1]
bysort memb_bid_str: gen cost2=non_pharm_cost[_n+2]
gen three_month_non_pharm=non_pharm_cost+cost1+cost2 
gen ln_non_pharm=log(1+non_pharm_cost)
gen ln_three_month_non_pharm=log(1+three_month_non_pharm) 

*park the full member-month data so it can be re-attached after the collapse
tempfile panel
save `panel', replace

*reduce to one row per member-prescriber pair holding the mean entropy score
*across all of that pair's rows
collapse (mean) entropy_molecule_avg, by(memb_bid_str npi_str)

*re-attach the member-month rows; no _merge variable is kept.
*NOTE(review): with default merge behaviour the collapsed (master) value of
*	entropy_molecule_avg is the one retained on matched rows -- confirm this is intended
merge 1:m memb_bid_str npi_str using `panel', nogenerate

*write the final analysis dataset used by Steps 3-5
save adults_sample_series_merged_treatment_period.dta, replace

*Step 3: Generate final regression sample: must have valid entropy/violation data
********************************************************************************
*bring in full monthly data for entropy regressions
use adults_sample_series_merged_treatment_period.dta, clear
sort memb_bid_str month_code
*a first_month of 0 is treated as missing, so those members fail the month filter below
replace first_month=. if first_month==0 

*if first drug in (t), starts from outcome in (t+1) on characteristics in (t) 
keep if month_code>=first_month+1 
*same restriction as the Table 3 regressions; the entropy variable is spelled out in
*full (the original used the abbreviation entropy_molecule, which errors if it
*ever becomes ambiguous)
keep if i_full==1 & entropy_molecule_avg!=. & i_npi_str!=.

*keep one observation per patient	
by memb_bid_str, s: keep if _n==1
keep memb_bid_str
*list of members eligible for the entropy regressions, merged with the violation list below
tempfile entropy
save `entropy', replace

*reload the full monthly data to build the violation-regression sample
use adults_sample_series_merged_treatment_period.dta, clear
sort memb_bid_str month_code
replace first_month=. if first_month==0

*one-row lag of each guideline variable within member
foreach guide in UK US CA cocktail {
	bysort memb_bid_str: gen lag_transition_`guide'=guide_`guide'[_n-1]
	label var lag_transition_`guide' "`guide'"
}
*diagnostic summary on the rows that will survive the filters below
sum lag_transition_* if month_code>=first_month+1 & merge_n_violate==3 

*if first drug in (t), starts from outcome in (t+1) on characteristics in (t) 
keep if month_code>=first_month+1
keep if merge_n_violate==3
keep if lag_transition_US!=.
unique memb_bid

*reduce to one row per patient and keep only the identifier
by memb_bid_str, s: keep if _n==1
keep memb_bid_str

*intersect with the entropy-regression members: only patients present in both
*lists (_merge==3) remain in the overlap sample
merge 1:1 memb_bid_str using `entropy'
keep if _merge==3
keep memb_bid_str

*save resulting list of patients
save overlap_sample.dta, replace

*Step 4: Generate estimates from Table 3
********************************************************************************
*Regresses each outcome on the average prescriber entropy score, once with county
*fixed effects and once with member fixed effects, for three patient samples.
*Output: one log file per sample and one .tex table per outcome and sample.

*bring in data
use adults_sample_series_merged_treatment_period.dta, clear

*make sure only "overlap" sample (as defined in Step 3 above)
*overlap_sample.dta contains only memb_bid_str, so the key is spelled out in full
merge m:1 memb_bid_str using overlap_sample.dta
keep if _merge==3
drop _merge

*apply sample restriction (whereby complete history of entropy)
keep if i_full==1 & entropy_molecule_avg!=. & i_npi_str!=. 

*define subsets of patients (for panels A, B, and C)
*subset0 = everyone, subset1 = sum_i_psych>0, subset2 = sum_i_psych==0
generate subset1 = sum_i_psych>0
generate subset2 = sum_i_psych==0
generate subset0 = 1

*count unique members within the sample for regressions
*uniq_memb==1 marks exactly one row per member
bysort memb_bid_str: gen uniq_memb=_n
count if uniq_memb==1 

*label variables
label variable entropy_molecule_avg "Avg(E_mol) over treatment"

*run regressions (three outer loops of three panels; four inner loops for four outcomes)
forvalues sample=0(1)2 {
	preserve

	*define relevant sample
	keep if subset`sample'==1

	local entropy entropy_molecule_avg

	log using "`entropy'-subset`sample'.log",replace

	foreach dep_var of varlist ln_total_cost ln_non_pharm i_ER_hosp i_ER_hosp_mh {

		count
		count if uniq_memb==1 

		*Display summary statistics for estimation
		disp "Summary Stats for `dep_var'"
		sum `dep_var' 
		sum `dep_var' if `entropy'!=. & i_npi_str!=.
		quietly return list
		*mean of the outcome, reported in the table footer
		scalar mean_y=`r(mean)'
		*number of members with at least one non-missing value of the outcome
		bysort memb_bid_str: egen check_`dep_var'=mean(`dep_var')
		count if uniq_memb==1 & check_`dep_var'!=.
		quietly return list
		scalar n_member=`r(N)'
		drop check_`dep_var'

		*Odd columns
		disp "Model (A) - cost on entropy and county fixed effects"
		eststo: xi: areg `dep_var' `entropy' memb_female i.brth_yr i.serv_yr i.serv_month, absorb(memb_county) cluster(i_npi_str) 
		estadd local county_FE "Yes",replace
		estadd local member_FE "No",replace
		estadd scalar n_member
		estadd scalar mean_y

		*Even columns
		disp "Model (B) - cost on entropy and member fixed effects"
		eststo: xi: areg `dep_var' `entropy' i.serv_yr i.serv_month, absorb(memb_bid_str) cluster(i_npi_str) 
		estadd local county_FE "No",replace
		estadd local member_FE "Yes",replace
		estadd scalar n_member
		estadd scalar mean_y

		*write both models for this outcome to one table
		*(file name now quoted on both sides; the original had only a closing quote)
		esttab using "`dep_var'-on-`entropy'-subset`sample'.tex", keep(`entropy' _cons) indicate(Birth Year included=_Ibrth_yr*) cells(b(fmt(5)) se(fmt(5) par)) collabels(none) mlabels(none) stats(mean_y county_FE member_FE r2 r2_a F N n_member,fmt(%9.0g) label("Mean(y)" "County FE" "Member FE" "R2" "Adj. R2" "F" "N" "No. Members")) addnotes("Observations are at member-month level." "Restrict the sample to members with Complete records of prescribers' entropy scores over the treatment period." "Standard errors are clustered at the prescriber (NPI) level.") label nostar replace

		*start the next outcome with an empty set of stored estimates
		eststo clear
	}

	log close
	restore
}

*Step 5: Generate estimates from Table 4
********************************************************************************
*Regresses each outcome on the lagged guideline variables (UK, US, CA, cocktail),
*once with county fixed effects and once with member fixed effects, for three
*patient samples. Output: one log file per sample and one .tex table per outcome and sample.

*bring in data
use adults_sample_series_merged_treatment_period.dta, clear

*make sure only "overlap" sample (as defined in Step 3 above)
*overlap_sample.dta contains only memb_bid_str, so the key is spelled out in full
merge m:1 memb_bid_str using overlap_sample.dta
keep if _merge==3
drop _merge

*define %lagged scripts that violate each guideline
*rows are ordered by month_code within member so that [_n-1] is the preceding month's
*row; the merge above does not guarantee any within-member order
local list UK US CA cocktail
foreach var of local list{
	bysort memb_bid_str (month_code): gen lag_transition_`var'=guide_`var'[_n-1]
	label var lag_transition_`var' "`var'"
} 

*apply sample restriction (exclude first month)
keep if month_code>=first_month+1 
keep if merge_n_violate==3 
keep if lag_transition_US!=. 

*define subsets of patients (for panels A, B, and C)
*subset0 = everyone, subset1 = sum_i_psych>0, subset2 = sum_i_psych==0
generate subset1 = sum_i_psych>0
generate subset2 = sum_i_psych==0
generate subset0 = 1

*count unique members within the sample for regressions
*uniq_memb is used by the member counts inside the loop but does not exist in the
*saved dataset, so it is created here (uniq_memb==1 marks one row per member)
bysort memb_bid_str: gen uniq_memb=_n
count if uniq_memb==1 

*run regressions (three outer loops of three panels; four inner loops for four outcomes)
forvalues sample=0(1)2 {
	preserve

	*define relevant sample
	keep if subset`sample'==1

	log using transition-subset`sample'.log,replace

	foreach dep_var of varlist ln_total_cost ln_non_pharm i_ER_hosp i_ER_hosp_mh{

		count
		count if uniq_memb==1 

		*Display summary statistics for estimation
		disp "Summary Stats for `dep_var'"
		sum `dep_var' 
		quietly return list
		*mean of the outcome, reported in the table footer
		scalar mean_y=`r(mean)'
		*number of members with at least one non-missing value of the outcome
		bysort memb_bid_str: egen check_`dep_var'=mean(`dep_var')
		count if uniq_memb==1 & check_`dep_var'!=.
		quietly return list
		scalar n_member=`r(N)'
		drop check_`dep_var'

		*Odd columns
		disp "Model (A) - all; county fixed effects"
		eststo: xi: areg `dep_var' lag_transition_UK lag_transition_US lag_transition_CA lag_transition_cocktail memb_female i.brth_yr i.serv_yr i.serv_month, absorb(memb_county) cluster(i_npi_str) 
		estadd local county_FE "Yes",replace
		estadd local member_FE "No",replace
		estadd scalar n_member
		estadd scalar mean_y

		*Even columns
		*message corrected: this model absorbs member fixed effects only (county_FE is "No")
		disp "Model (B) - all; patient fixed effects"
		eststo: xi: areg `dep_var' lag_transition_UK lag_transition_US lag_transition_CA lag_transition_cocktail i.serv_yr i.serv_month, absorb(memb_bid_str) cluster(i_npi_str) 
		estadd local county_FE "No",replace
		estadd local member_FE "Yes",replace
		estadd scalar n_member
		estadd scalar mean_y

		*write both models for this outcome to one table
		esttab using `dep_var'-on-transition-violate-subset`sample'.tex, keep(lag_transition_UK lag_transition_US lag_transition_CA lag_transition_cocktail memb_female _cons) indicate(Birth Yr=_Ibrth_yr*) cells(b(fmt(5)) se(fmt(5) par)) collabels(none) mlabels(none) stats(mean_y county_FE member_FE r2 r2_a F N n_member,fmt(%9.0g) label("Mean(y)" "County FE" "Member FE" "R2" "Adj. R2" "F" "N" "No. Members")) addnotes("Observations are at member-month level." "Restrict the sample to members with non-missing lagged NPI of prescribers." "Standard errors are clustered at the prescriber (NPI) level.") label nostar replace

		*start the next outcome with an empty set of stored estimates
		eststo clear
	}

	log close
	restore
}
